ITM and elevation – preliminary study.

Descriptive stats

Average elevation across different languages

data <- read.csv('elevation_villages_correct.csv')
data %>% 
  group_by(language) %>% 
  slice(which.min(elevation)) %>% 
  select(eng_vil_name, elevation) %>% 
  write.csv('minimum_vil.csv')
## Adding missing grouping variables: `language`

Average elevation across different languages (grey dots represent different villages:)

ggplot(data=data, aes(x=fct_reorder(language, elevation, .fun=mean, .desc=TRUE), y=elevation))+
  theme_bw()+
  # stat_summary(fun.data = mean_se,  
  #                geom = "errorbar", color='black', alpha=0.3)+
  geom_jitter(color='grey', alpha=0.1)+
  stat_summary(aes(color=Family), fun = "mean", geom = "point", size=2)+
  labs(y = "Elevation", x='Language', color='Family')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  theme(legend.position = "top")+
  scale_color_viridis(discrete = TRUE, option = "D")+
  coord_flip()+
  theme(axis.text.x=element_text(angle=0, hjust=0.5),
        # text = element_text(family = "Andale Mono"),
        legend.position = 'bottom')+
  geom_hline(aes(yintercept=mean(elevation)), color='red', linetype='dotted')+
  guides(colour = guide_legend(nrow = 1))

ggsave('elev_lang.jpg', width = 8, height = 5)

How the population size correlates with elevation? Well, there seems to be a slight decrease with higher elevations (although we cannot trust those regressions).

data$log_c <- 0
data$log_c <- log(data$census_1926)

data$el_100 <- plyr::round_any(data$elevation, 1, f = ceiling)

data %>%
  group_by(el_100) %>%
  summarise(mean_c = mean(log_c)) %>%
  ggplot(aes(x=el_100, y=mean_c))+
    geom_point(alpha=0.4)+
    geom_smooth(method = 'lm', color='red')+
    theme_bw()+
    labs(y = "Mean population size", x='Elevation')
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

Average elevation across different languages (groupped by family)

ggplot(data=data, aes(x=fct_reorder(language, elevation, .fun=mean, .desc=TRUE), y=elevation))+
  theme_bw()+
  # geom_boxplot(aes(fill=Family))+
  geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 60, method = 'histodot')+
  facet_wrap(vars(Family), scales = "free_y", strip.position = 'top')+
  labs(y = "Elevation", x='Language', size=' 1926 Census\n data')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  theme(legend.position = "top")+
  scale_color_viridis(discrete = TRUE, option = "D")+
  coord_flip()+
  theme(legend.position = "none",
        legend.key = element_blank(), 
        strip.text = element_text(face="bold"), 
        axis.text.x=element_text(angle=0, hjust=0.5)) 

ggsave('elev_lang.jpg', width = 8, height = 5)
data[data$language == 'Godoberi ',] %>% 
  write.csv('godoberi.csv')

Average elevation across families (groupped by family)

ggplot(data, aes(y=elevation, x=Family))+
  theme_bw()+
  geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 41, method = 'histodot')+
  theme(legend.position = "none",
        legend.key = element_blank(), 
        strip.text = element_text(face="bold"), 
        axis.text.x=element_text(angle=0, hjust=0.5))+
  # stat_summary(fun = "mean", geom = "crossbar", size=0.5, alpha=0.1, color='grey')+
  # geom_hline(aes(yintercept=mean(elevation)), color='red', linetype='dotted')

ggsave('family_dist.jpg', width = 8, height = 5)

Generalized Linear Models

all <- read.csv('all.csv')
kable(head(all))
X residence expedition name number.of.na russian.na sex type year_of_birth year.of.death аварский агульский азербайджанский азербайджанский.или.кумыкский акушинский.даргинский андийский арчинский ахвахский багвалинский бежтинский ботлихский гапшиминский.даргинский гдымско.фийский.лезгинский гинухский грузинский даргинский кадарский.даргинский кайтагский.даргинский каратинский кубачинский.даргинский кумыкский лакский лезгинский мегебский муиринский.даргинский русский рутульский сирхинский.даргинский табасаранский тукитинский хновский.рутульский цахурский цезский цудахарский.даргинский чеченский чирагский.даргинский mother.tongue village.population language.population number.of.lang mother.tongue.match number.of.lang.strat elevation
0 Balkhar Balkhar, Tsulikana, Shukty, Kuli Иса 0 0 м 0 1885 1965 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484
1 Balkhar Balkhar, Tsulikana, Shukty, Kuli Бет1и 0 0 ж 0 1888 1978 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484
2 Balkhar Balkhar, Tsulikana, Shukty, Kuli Абдула 0 0 м 0 1890 1972 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 2 1 2 1658.484
3 Balkhar Balkhar, Tsulikana, Shukty, Kuli Патимат 0 0 ж 0 1890 1985 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484
4 Balkhar Balkhar, Tsulikana, Shukty, Kuli Исмаил 0 0 м 0 1890 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 2 1 2 1658.484
5 Balkhar Balkhar, Tsulikana, Shukty, Kuli Жумяъ/Маи 0 0 ж 0 1890 1967 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484

Average elevation across different number of L2’s

ggplot(data = all, aes(y=elevation, x=factor(number.of.lang.strat)))+
  theme_bw()+
  geom_violin(fill='grey', color='grey', trim = TRUE)

Poisson and linear regressions

fit_p <- glm(number.of.lang.strat ~ elevation + year_of_birth*sex + language.population*residence, family="poisson", data=all)
fit_lin <- glm(number.of.lang.strat ~ elevation + year_of_birth*sex + language.population*residence, data=all)

Linear:

ggplot(ggpredict(fit_lin, terms = c('elevation')), aes(x, predicted)) +
  geom_line()+
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1)+
  theme_bw()

  # facet_wrap(~facet)

Poisson:

ggplot(ggpredict(fit_p, terms = c('elevation')), aes(x, predicted)) +
  geom_line()+
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1)+
  theme_bw()

  # facet_wrap(~facet)
ITM <- read.csv('ITM.csv')
all <- read.csv('all.csv')
all_subset <- all[all$year_of_birth < 1930,]

all_subset$norm_yb <- scale(all_subset$year_of_birth)
all_subset$norm_el <- scale(all_subset$elevation)
all_subset$norm_itm <- scale(all_subset$number.of.lang.strat)
all_subset$norm_pop <- scale(all_subset$language.population)

all$norm_yb <- scale(all$year_of_birth)
all$norm_el <- scale(all$elevation)
all$norm_itm <- scale(all$number.of.lang.strat)
all$norm_pop <- scale(all$language.population)
all$norm_vil_pop <- scale(all$village.population)

pois_2 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=all_subset, family='poisson')
summary(pois_2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
##    Data: all_subset
## 
##      AIC      BIC   logLik deviance df.resid 
##   4301.6   4323.8  -2146.8   4293.6     1921 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4442 -0.4142 -0.0894  0.2961  3.6068 
## 
## Random effects:
##  Groups                  Name        Variance Std.Dev.
##  norm_yb:sex             (Intercept) 0.0134   0.1157  
##  residence:mother.tongue (Intercept) 0.5030   0.7092  
## Number of obs: 1925, groups:  norm_yb:sex, 139; residence:mother.tongue, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.2453     0.1073  -2.286   0.0223 * 
## norm_el       0.3285     0.1068   3.076   0.0021 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## norm_el -0.017

Garik M. model:

garik_model <- glmer(number.of.lang.strat ~ sex + norm_pop + (1|norm_yb) + (1|residence), data=all, family='poisson')
summary(garik_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: number.of.lang.strat ~ sex + norm_pop + (1 | norm_yb) + (1 |  
##     residence)
##    Data: all
## 
##      AIC      BIC   logLik deviance df.resid 
##  12671.9  12705.8  -6330.9  12661.9     6473 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2651 -0.5178 -0.1423  0.3334  4.8590 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  norm_yb   (Intercept) 0.1061   0.3257  
##  residence (Intercept) 0.7807   0.8836  
## Number of obs: 6478, groups:  norm_yb, 158; residence, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.75257    0.13259  -5.676 1.38e-08 ***
## sexм         0.24847    0.02819   8.814  < 2e-16 ***
## norm_pop    -0.29958    0.11690  -2.563   0.0104 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) sexм  
## sexм     -0.124       
## norm_pop -0.080 -0.001

Normalized elevation vs. normalized village population.

ggplot(data=all, aes(x=norm_el, y=norm_vil_pop))+
  geom_point()

pois_2_pop <- glmer(number.of.lang ~ norm_el + (1|norm_yb) + (1|sex) + (1|residence) + norm_pop + norm_vil_pop, data=all, family='poisson')
summary(pois_2_pop)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang ~ norm_el + (1 | norm_yb) + (1 | sex) + (1 | residence) +  
##     norm_pop + norm_vil_pop
##    Data: all
## 
##      AIC      BIC   logLik deviance df.resid 
##  12659.3  12706.8  -6322.7  12645.3     6471 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2613 -0.5227 -0.1316  0.3337  4.8098 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  norm_yb   (Intercept) 0.10649  0.3263  
##  residence (Intercept) 0.47514  0.6893  
##  sex       (Intercept) 0.02003  0.1415  
## Number of obs: 6478, groups:  norm_yb, 158; residence, 50; sex, 2
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.67170    0.14619  -4.595 4.33e-06 ***
## norm_el       0.28415    0.10318   2.754  0.00589 ** 
## norm_pop     -0.30435    0.09433  -3.227  0.00125 ** 
## norm_vil_pop -0.50435    0.11194  -4.506 6.62e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) norm_l nrm_pp
## norm_el     -0.034              
## norm_pop    -0.041  0.138       
## norm_vil_pp  0.094  0.095  0.125
ITM$norm_yb <- scale(ITM$year_of_birth)
ITM$norm_el <- scale(ITM$elevation)
ITM$norm_itm <- scale(ITM$number.of.lang.strat)
pois_1 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=ITM, family='poisson')
summary(pois_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
##    Data: ITM
## 
##      AIC      BIC   logLik deviance df.resid 
##   8494.0   8519.5  -4243.0   8486.0     4323 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2029 -0.4526 -0.1267  0.2698  3.8511 
## 
## Random effects:
##  Groups                  Name        Variance Std.Dev.
##  norm_yb:sex             (Intercept) 0.04944  0.2223  
##  residence:mother.tongue (Intercept) 0.74412  0.8626  
## Number of obs: 4327, groups:  norm_yb:sex, 118; residence:mother.tongue, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6311     0.1281  -4.928 8.33e-07 ***
## norm_el       0.3873     0.1281   3.023  0.00251 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## norm_el -0.059

Poisson regression vs. real data

pois_1 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=ITM, family='poisson')
summary(pois_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
##    Data: ITM
## 
##      AIC      BIC   logLik deviance df.resid 
##   8494.0   8519.5  -4243.0   8486.0     4323 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2029 -0.4526 -0.1267  0.2698  3.8511 
## 
## Random effects:
##  Groups                  Name        Variance Std.Dev.
##  norm_yb:sex             (Intercept) 0.04944  0.2223  
##  residence:mother.tongue (Intercept) 0.74412  0.8626  
## Number of obs: 4327, groups:  norm_yb:sex, 118; residence:mother.tongue, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6311     0.1281  -4.928 8.33e-07 ***
## norm_el       0.3873     0.1281   3.023  0.00251 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## norm_el -0.059
AIC(pois_1)
## [1] 8493.972
ITM_el <- dplyr::select(ITM, elevation, number.of.lang.strat) %>% 
  group_by(elevation) %>% 
  summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` ungrouping output (override with `.groups` argument)
kable(head(ITM_el))
elevation mean_itm count
355.0269 0.0000000 94
365.5033 1.0689655 145
407.0000 0.0973451 113
450.7447 0.5662651 83
491.3748 0.8709677 31
667.3964 0.0000000 66
pred <- ggpredict(pois_1, terms = c('norm_el'))
true_sequence <- seq(min(ITM$elevation), max(ITM$elevation), length.out=21)
ggplot()+
  geom_point(data=ITM_el, aes(x=elevation, y=mean_itm, size=count), color='blue', alpha=.5)+
  geom_line(data=pred, aes(x=true_sequence, y=predicted), size=0.8)+
  geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=true_sequence), alpha = .15)+
  theme_bw()+
  labs(x = "Elevation", y='Average ITM', size=' Number\n of observations')

ggsave('poison_observ.jpg', width = 7, height = 4)

Case stydy: languages spoken at the highest attitudes:

Hypothesis: isolation is not the primary source of language diversity.

all$rounded.yb <- plyr::round_any(all$year_of_birth, 5, f = ceiling)
  L2 <- all[all$mother.tongue != 'рутульский' & all$'рутульский' == 1,] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(рутульский), count = n(), itm = mean(number.of.lang)) 
  L1 <- all[all$mother.tongue == 'рутульский',] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(рутульский), count = n(), itm = mean(number.of.lang)) 
  ggplot()+
    theme_bw()+
    geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
    geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
    labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
    ggtitle('Rutul knowledge as L1 or L2')

  L2 <- all[all$mother.tongue != 'аварский' & all$'аварский' == 1,] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(аварский), count = n(), itm = mean(number.of.lang)) 
  L1 <- all[all$mother.tongue == 'аварский',] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(аварский), count = n(), itm = mean(number.of.lang)) 
  ggplot()+
    theme_bw()+
    geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
    geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
    labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
    ggtitle('Avar knowledge as L1 or L2')

  L2 <- all[all$mother.tongue != 'лакский' & all$'лакский' == 1,] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(лакский), count = n(), itm = mean(number.of.lang)) 
  L1 <- all[all$mother.tongue == 'лакский',] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(лакский), count = n(), itm = mean(number.of.lang)) 
  ggplot()+
    theme_bw()+
    geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
    geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
    labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
     ggtitle('Lak knowledge as L1 or L2')

  L2 <- all[all$mother.tongue != 'азербайджанский' & all$'азербайджанский' == 1,] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(азербайджанский), count = n(), itm = mean(number.of.lang)) 
  L1 <- all[all$mother.tongue == 'азербайджанский',] %>%
    group_by(rounded.yb) %>%
    mutate(mean_itm = mean(лакский), count = n(), itm = mean(number.of.lang)) 
  ggplot()+
    theme_bw()+
    geom_point(data = L2, aes(x=rounded.yb, y=count, colour='L2', size=itm))+
    geom_point(data = L1, aes(x=rounded.yb, y=count, colour='L1', size=itm))+
    labs(y = "Number of speakers", x= 'Year of birth', colour='Type')+
     ggtitle('Azerbaijani knowledge as L1 or L2')

all %>%
  group_by(rounded.yb) %>%
  mutate(mean_itm = mean(рутульский), count = n()) %>%
  ggplot(aes(x=rounded.yb, y=mean_itm))+
    geom_point(aes(size=count))+
    geom_smooth(aes(weight = count), method=loess,  color='red')+
    theme_bw()+
    labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'

all %>%
  group_by(rounded.yb) %>%
  mutate(mean_itm = mean(аварский), count = n()) %>%
  ggplot(aes(x=rounded.yb, y=mean_itm))+
    geom_point(aes(size=count))+
    geom_smooth(aes(weight = count), method=loess,  color='red')+
    theme_bw()+
    labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'

all %>%
  group_by(rounded.yb) %>%
  mutate(mean_itm = mean(лакский), count = n()) %>%
  ggplot(aes(x=rounded.yb, y=mean_itm))+
    geom_point(aes(size=count))+
    geom_smooth(aes(weight = count), method=loess,  color='red')+
    theme_bw()+
    labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'

all %>%
  group_by(rounded.yb) %>%
  mutate(mean_itm = mean(сирхинский.даргинский), count = n()) %>%
  ggplot(aes(x=rounded.yb, y=mean_itm))+
    geom_point(aes(size=count))+
    geom_smooth(aes(weight = count), method=loess,  color='red')+
    theme_bw()+
    labs(y = "Mean ITM", x= 'Year of birth', size=' Number of \n observations')
## `geom_smooth()` using formula 'y ~ x'

all %>%
  group_by(mother.tongue) %>%
  summarise(count = n()) %>%
  arrange(count, desc = FALSE)
## `summarise()` ungrouping output (override with `.groups` argument)
## Registered S3 method overwritten by 'cli':
##   method     from    
##   print.boxx spatstat
## # A tibble: 29 x 2
##    mother.tongue              count
##    <fct>                      <int>
##  1 муиринский даргинский         49
##  2 цудахарский даргинский        93
##  3 кумыкский                     97
##  4 чирагский даргинский         101
##  5 гапшиминский даргинский      104
##  6 табасаранский                111
##  7 ахвахский                    112
##  8 лезгинский                   112
##  9 гинухский                    154
## 10 гдымско-фийский лезгинский   157
## # … with 19 more rows

Some geospatial ideas

ITM

itm_coors <- read.csv('itm_coords_no_Hinuq.csv')
itm_coors <- itm_coors[complete.cases(itm_coors), ]
kable(head(itm_coors))
X index Unnamed..0 expedition name number.of.na russian.na sex type year_of_birth year.of.death аварский агульский азербайджанский азербайджанский.или.кумыкский акушинский.даргинский андийский арчинский ахвахский багвалинский бежтинский ботлихский гапшиминский.даргинский гдымско.фийский.лезгинский гинухский грузинский даргинский кадарский.даргинский кайтагский.даргинский каратинский кубачинский.даргинский кумыкский лакский лезгинский мегебский муиринский.даргинский русский рутульский сирхинский.даргинский табасаранский тукитинский хновский.рутульский цахурский цезский цудахарский.даргинский чеченский чирагский.даргинский mother.tongue village.population language.population number.of.lang mother.tongue.match number.of.lang.strat elevation Lat Lon
0 Balkhar 0 Balkhar, Tsulikana, Shukty, Kuli Абедет 0 0 ж 0 1922 2000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
1 Balkhar 1 Balkhar, Tsulikana, Shukty, Kuli Абдурахман 0 0 м 0 1923 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
2 Balkhar 2 Balkhar, Tsulikana, Shukty, Kuli Магомед 0 0 м 0 1923 1995 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
3 Balkhar 3 Balkhar, Tsulikana, Shukty, Kuli Жума 0 0 ж 0 1924 1984 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
4 Balkhar 4 Balkhar, Tsulikana, Shukty, Kuli Шаммадаев Калла 0 0 м 0 1924 2004 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 2 1 2 1658.484 42.2797 47.2314
5 Balkhar 5 Balkhar, Tsulikana, Shukty, Kuli Гусейн 0 0 м 0 1925 2010 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
itm_vill <- itm_coors %>% 
  group_by(index, Lat, Lon, elevation) %>% 
  summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` regrouping output by 'index', 'Lat', 'Lon' (override with `.groups` argument)
kable(head(itm_vill))
index Lat Lon elevation mean_itm count
Balkhar 42.27970 47.23140 1658.4839 0.3296703 91
Bezhta 42.13390 46.12470 1633.0813 1.3636364 143
Burkikhan 41.80636 47.53847 1917.4192 0.7826087 92
Chabanmakhi 42.63784 47.25923 1020.0406 0.6400000 75
Chankurbe 42.63047 47.30022 882.7377 0.5652174 92
Chirag 41.83810 47.43030 2223.1909 0.9718310 71

Case study: highest languages:

Different graph-based ideas

matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix < 19] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)

itm_vill$knn_dist <- knn(net)$knn

itm_vill$degree <- degree(net)

ggplot(itm_vill, aes(x=factor(degree), y=mean_itm, label=index))+
  theme_bw()+
  geom_boxplot()

itm_vill %>% 
  group_by(degree)  %>% 
  summarise(mean = mean(mean_itm), sd = sd(mean_itm),
            n = n())  %>% 
  mutate(se = sd / sqrt(n),
         lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
         upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se)  %>% 
  ggplot(aes(x=factor(degree), y=mean))+
    geom_point()+
    geom_errorbar(aes(ymin=lower.ci, ymax=upper.ci))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning in qt(1 - (0.05/2), n - 1): созданы NaN

## Warning in qt(1 - (0.05/2), n - 1): созданы NaN

itm_vill %>% 
  group_by(degree)  %>% 
  summarise(mean = mean(mean_itm), sd = sd(mean_itm),
            n = n())  %>% 
  mutate(se = sd / sqrt(n),
         lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
         upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se)  %>% 
  ggplot(aes(x=factor(degree), y=mean))+
    geom_point()+
    geom_errorbar(aes(ymin=lower.ci, ymax=upper.ci))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning in qt(1 - (0.05/2), n - 1): созданы NaN

## Warning in qt(1 - (0.05/2), n - 1): созданы NaN

The villages that are closer than 12km to each-other are interconnected with mean itm and elevation plotted for each village.

matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 12] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)

net %>%  
  ggraph(layout = 'kk')+
  geom_edge_link(color = "orange")+
  theme_void()+
  geom_node_point(aes(size=itm_vill$mean_itm, color=itm_vill$elevation)) +  
  geom_node_text(size = 4,
                 aes(label = name), repel=TRUE)+
  labs(size='Mean ITM', color='Elevation')

ggsave('village_graph.jpg', width = 7, height = 4)

Same plot, but mapped to the actual geospatial data.

matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 12] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)

ggraph(graph=net, x=itm_vill$Lon, y=itm_vill$Lat)+
  theme_void()+
  geom_edge_link(color = "orange")+
  geom_node_point(aes(color=itm_vill$mean_itm)) +  
  geom_node_text(size = 4,
                 aes(label = name), repel=TRUE)+
  labs(size='Mean ITM', color='Mean ITM')+
  annotation_map(map_data("world"), fill = NA, colour = "grey50")

ggsave('village_graph_map.jpg', width = 7, height = 4)

Families

data_short <- data
matrix_fam <- distm(data_short[,c('Lon','Lat')], data_short[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000

All the villages from the bigger dataset interconnected if they are closer than 6kms. Colored by family.

matrix_fam[matrix_fam > 6] <- 0
rownames(matrix_fam) <- data_short$eng_vil_name
colnames(matrix_fam) <- data_short$eng_vil_name
net_fam <- graph.adjacency(matrix_fam, mode = "undirected", weighted = TRUE, diag = FALSE)

net_fam %>%
  # ggraph(layout = 'kk')+
  ggraph(x=data_short$Lon, y=data_short$Lat)+
  geom_edge_link(color = "orange")+
  theme_void()+
  geom_node_point(aes(color=data_short$Family), size=0.5)+
  labs(color='Family')

  # geom_node_text(aes(label = name), repel=TRUE)+
  # ggsave('village_graph_map.pdf', width = 30, height = 30)

The number of neighbours in the graph above with regards to the elevation of the village (elevation is rounded to 100-s). Seems to be no effect.

data$neighbours_6 <- degree(net_fam)
data$log_c <- log(data$census_1926)

data$elevation_round <- plyr::round_any(data$elevation, 100, f = ceiling)

data %>%
  group_by(elevation_round) %>%
  summarise(neighbours = mean(neighbours_6), n_6 = neighbours_6, size=n(), pop = census_1926) %>%
  ggplot(aes(x=elevation_round, y=neighbours))+
    geom_line()+
    geom_point(aes(y=n_6, size=pop), alpha=0.1, color='blue')+
    theme_bw()
## `summarise()` regrouping output by 'elevation_round' (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).

Same graph as with bigger dataset, but with multidag data.

by_lang <- data  %>%  
  group_by(language, Family)  %>%  
  summarise(Lon = mean(Lon), Lat = mean(Lat), n = n())
## `summarise()` regrouping output by 'language' (override with `.groups` argument)
m_l <- distm(by_lang[,c('Lon','Lat')], by_lang[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
m_l[m_l > 25] <- 0
rownames(m_l) <- by_lang$language
colnames(m_l) <- by_lang$language
net_l <- graph.adjacency(m_l, mode = "undirected", weighted = TRUE, diag = FALSE)

net_l %>%
  ggraph(layout = 'kk')+
  geom_edge_link(color = "orange")+
  theme_void()+
  geom_node_point(aes(color=by_lang$Family))+
  geom_node_text(size = 2,
                aes(label = name), repel=TRUE)+
  labs(color='Family')

How many members of different families are around each village?

data_short <- data
matrix_fam <- distm(data_short[,c('Lon','Lat')], data_short[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix_fam[matrix_fam > 100] <- 0
net_fam <- graph.adjacency(matrix_fam, mode = "undirected", weighted = TRUE, diag = FALSE)

V(net_fam)$label <- data$Family

SameLabel <-  function(e) {
    V(net_fam)[ends(net_fam, e)[1]]$label == V(net_fam)[ends(net_fam, e)[2]]$label }
g2 <- delete_edges(net_fam, which(sapply(E(net_fam), SameLabel)))

data_short$degree <- degree(g2)

ggplot(data=data_short, aes(x=Family, y=degree))+
  geom_boxplot()

  # geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 0.45)

# ggsave('degree_family.jpg', width = 7, height = 4)
data_short  %>% 
  group_by(Family)  %>% 
  summarise(mean = mean(degree))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 7 x 2
##   Family  mean
##   <fct>  <dbl>
## 1 Andic   487.
## 2 Avar    409.
## 3 Dargwa  586.
## 4 Lak     839.
## 5 Lezgic  340.
## 6 Tsezic  405.
## 7 Turkic  589.
data_itm_fam <- read.csv('itm_coords_Family.csv')
itm_vill_fam <- data_itm_fam %>% 
  group_by(index, Lat, Lon, elevation, Family, mother.tongue) %>% 
  summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` regrouping output by 'index', 'Lat', 'Lon', 'elevation', 'Family' (override with `.groups` argument)
itm_vill_fam <- itm_vill_fam[!itm_vill_fam$index == 'Hinuq',]
itm_vill_fam[itm_vill_fam$mother.tongue == 'кадарский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'сирхинский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'цудахарский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'гдымско-фийский лезгинский',]$Family <- 'Lezgic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'цахурский',]$Family <- 'Lezgic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'азербайджанский',]$Family <- 'Turkic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'ахвахский',]$Family <- 'Andic'
itm_vill_fam$Lat <- itm_vill$Lat
itm_vill_fam$Lon <- itm_vill$Lon

Villages that are closer to each other than 25kn and dont share the same mother tongue:

matrix <- distm(itm_vill_fam[,c('Lon','Lat')], itm_vill_fam[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 25] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill_fam$index
colnames(matrix) <- itm_vill_fam$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)
# V(net)$label <- itm_vill_fam$Family
V(net)$label <- itm_vill_fam$mother.tongue
SameLabel <-  function(e) {
    V(net)[ends(net, e)[1]]$label == V(net)[ends(net, e)[2]]$label }
g2 <- delete_edges(net, which(sapply(E(net), SameLabel)))

itm_vill_fam$deg <- degree(g2)

# DiffLabel <-  function(e) {
#     V(net)[ends(net, e)[1]]$label != V(net)[ends(net, e)[2]]$label }
# g3 <- delete_edges(net, which(sapply(E(net), DiffLabel)))
# 
itm_vill_fam$deg <- degree(g2)
# itm_vill_fam$deg_tot <- degree(g3)
# itm_vill_fam$deg <- log(itm_vill_fam$deg_n/itm_vill_fam$deg_tot)

g2 %>%  
  ggraph(layout = 'kk')+
  geom_edge_link(color = "orange", alpha=0.5)+
  theme_void()+
  geom_node_point(aes(color=itm_vill_fam$Family))+
  scale_color_brewer(palette="Dark2")+
  labs(color='Family')

# ggsave('village_graph.jpg', width = 7, height = 4)
ggplot(itm_vill_fam, aes(y=mean_itm, x=deg))+
  geom_point()

ggplot(itm_vill_fam, aes(x=Family, y=deg))+
  geom_boxplot()+
  labs(x='Degree')

itm_deg_m <- lmer(mean_itm ~ deg + (1|Family:mother.tongue) + count, itm_vill_fam)
summary(itm_deg_m)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mean_itm ~ deg + (1 | Family:mother.tongue) + count
##    Data: itm_vill_fam
## 
## REML criterion at convergence: 54.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.25696 -0.55455  0.05043  0.40487  2.47483 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Family:mother.tongue (Intercept) 0.12227  0.3497  
##  Residual                         0.05217  0.2284  
## Number of obs: 49, groups:  Family:mother.tongue, 28
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)  0.519597   0.175582 45.905846   2.959  0.00486 **
## deg          0.019170   0.023068 45.390793   0.831  0.41032   
## count        0.001549   0.001098 35.404594   1.412  0.16682   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) deg   
## deg   -0.676       
## count -0.679  0.128
pred <- ggpredict(itm_deg_m, terms = c('deg'))

ggplot()+
  geom_point(data=itm_vill_fam, aes(y=mean_itm, x=deg, color=Family))+
  geom_text_repel(data=itm_vill_fam, aes(label=index, y=mean_itm, x=deg))+
  geom_line(data=pred, aes(x=pred$x, y=predicted), size=0.8)+
  geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=pred$x), alpha = .15)+
  scale_color_brewer(palette="Dark2")

It seems that using the number of neighbouring villages that are speaking different languages from the given village we can predict ITM! Here the data is extracted from the bigger dataset:

overlap <- data_short[data_short$eng_vil_name %in% itm_vill_fam$index,]$eng_vil_name
ov_itm_vill_fam <- itm_vill_fam[itm_vill_fam$index %in% overlap,]
data_all <- data_short[data_short$eng_vil_name %in% itm_vill_fam$index,]
itm_vill_fam_m <- merge(x= ov_itm_vill_fam, y = data_all[, c('eng_vil_name', 'degree')], by.x='index', by.y = 'eng_vil_name', all=TRUE, copy=TRUE)

itm_deg <- lmer(mean_itm ~ degree + (1|Family:mother.tongue) + count, itm_vill_fam_m)
summary(itm_deg)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mean_itm ~ degree + (1 | Family:mother.tongue) + count
##    Data: itm_vill_fam_m
## 
## REML criterion at convergence: 60.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7488 -0.5158  0.2256  0.5314  1.9616 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Family:mother.tongue (Intercept) 0.06150  0.2480  
##  Residual                         0.09047  0.3008  
## Number of obs: 41, groups:  Family:mother.tongue, 25
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  1.0701453  0.2604493 28.9692203   4.109 0.000298 ***
## degree      -0.0009389  0.0004361 23.1926888  -2.153 0.041951 *  
## count        0.0016051  0.0012567 35.5238115   1.277 0.209824    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr) degree
## degree -0.841       
## count  -0.442 -0.026
pred <- ggpredict(itm_deg, terms = c('degree'))

ggplot()+
  geom_point(data=itm_vill_fam_m, aes(y=mean_itm, x=degree, color=Family))+
  geom_text_repel(data=itm_vill_fam_m, aes(label=index, y=mean_itm, x=degree))+
  geom_line(data=pred, aes(x=pred$x, y=predicted), size=0.8)+
  geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=pred$x), alpha = .15)+
  scale_color_brewer(palette="Dark2")